R Programming Presentation

Daniel Dean, Jessica Nunez, Erin Wall, Chayou Zhai

12/5/2019

Introduction:

Jeopardy has been running for 35 seasons. In this project our task was to pose a research question and then use R programming to manipulate raw show data to answer it. Our raw data included questions, answers, daily-double status, air dates, and notes. We were excited to get more experience with mapping in R, and this seemed like a good opportunity.

Research Questions:

Which countries are most frequently mentioned in Jeopardy, and how does that relate to the frequency of daily doubles?

Is there a correlation between land area and mentions in Jeopardy?

Is there a correlation between GDP and mentions in Jeopardy?

Methods:

Data Processing

Mapping

Statistics

Results:

Frequency

Is there a correlation between land area and mentions in Jeopardy?

Land Area and Frequency:

## Joining, by = "ISO3V10"
## Warning: Column `ISO3V10` joining factor and character vector, coercing into
## character vector
## Warning: Ignoring unknown aesthetics: frame, label
## Warning: Ignoring unknown aesthetics: frame

GDP and Frequency:

## Warning: Ignoring unknown aesthetics: frame, label
## Warning: Ignoring unknown aesthetics: frame

Daily Doubles

## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
## rgeos version: 0.5-2, (SVN revision 621)
##  GEOS runtime version: 3.6.1-CAPI-1.10.1 
##  Linking to sp version: 1.3-2 
##  Polygon checking: TRUE
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## Parsed with column specification:
## cols(
##   round = col_double(),
##   value = col_double(),
##   daily_double = col_character(),
##   category = col_character(),
##   comments = col_character(),
##   answer = col_character(),
##   question = col_character(),
##   air_date = col_date(format = ""),
##   notes = col_character()
## )
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
## Joining, by = c("round", "value", "daily_double", "category", "comments", "answer", "question", "air_date", "notes", "country", "iso3", "type")
##       rn        x          y
##   1: ABW -6565487  1339133.0
##   2: AFG  5901773  3616680.3
##   3: AGO  1644700 -1313636.8
##   4: AIA -5869637  1949085.5
##   5: ALB  1735387  4397165.1
##  ---                        
## 240: ZAF  2283373 -3098919.3
## 241: ZMB  2603114 -1438722.5
## 242: ZWE  2774634 -2032081.4
## 243: TUV 16815547  -829881.9
## 244: GUF -5026000   419853.4
## Warning: Ignoring unknown aesthetics: text

Conclusion

Tutorial:

Data Manipulation

Next Steps . . .

Next Steps Continued . . .

Matching Names

Finally . . .

Mapping:

Leaflet Package

# Packages for data wrangling and the interactive Leaflet map
library(tidyverse)
library(rworldmap)
library(sf)
library(leaflet)
library(dplyr)
library(viridisLite)
library(janitor)

# Country mentions with ISO codes, exported by the Plotly-section pipeline.
# (Original call had a stray trailing comma in the argument list.)
country_data_all <- read_csv("country_all_iso_all.csv")

# Convert rworldmap's SpatialPolygonsDataFrame to an sf object so it can be
# joined with tibbles and handed to leaflet()
countriesLow <- countriesLow %>%
  st_as_sf()

# Temporarily removing air date (not sure how to animate/facet/etc. in Leaflet)

# Per-country summary: number of mentions ("count") and mean clue value,
# enriched with rworldmap's country data via the ISO3 code.
country_geom_full <- country_data_all %>%
  left_join(countryExData, by = c("iso3" = "ISO3V10")) %>%
  group_by(iso3) %>%
  mutate(mean_value = mean(value)) %>%
  add_tally(name = "count") %>%
  ungroup() %>%
  select(country, count, mean_value, iso3) %>%
  distinct() %>%
  mutate(iso3 = toupper(iso3)) %>%
  rename(ISO3 = iso3)

# Attach polygon geometry for mapping. The join key is spelled out so the
# join is silent and reproducible (a bare full_join() guesses the key and
# prints a "Joining, by = ..." message).
country_geom_map_data <- country_geom_full %>%
  mutate(ISO3 = as.factor(ISO3)) %>%
  dplyr::full_join(countriesLow, by = "ISO3") %>%
  clean_names() %>%
  st_as_sf()

# Green color ramp keyed to the per-country mention count
pal <- colorNumeric(
  palette = "Greens",
  domain = country_geom_map_data$count
)

# HTML popup shown when a country polygon is clicked.
# Every label now carries a trailing space -- the original omitted it after
# "Population:" and "Count:", so values ran into the labels.
popup_info <- paste0("<b>Country:</b> ",
                     country_geom_map_data$name, "<br/>",
                     "<b>Population:</b> ",
                     country_geom_map_data$pop_est, "<br/>",
                     "<b>Count:</b> ",
                     country_geom_map_data$count, "<br/>",
                     "<b>Mean Value:</b> ",
                     round(country_geom_map_data$mean_value))




# Interactive choropleth: polygons shaded by mention count, popup on click
country_geom_map_data %>%
  leaflet() %>%
  addTiles() %>%
  addPolygons(color = ~pal(count), popup = popup_info)

### Plotly

# Packages for scraping Wikipedia and building the interactive world map.
# NOTE(review): attach order matters here -- rvest, data.table, and
# lubridate mask functions from purrr/readr/dplyr (see the masking
# messages in the rendered output above), so reordering these library()
# calls can change which function a bare name resolves to.
library(readr)
library(tidyverse)
library(dplyr)
library(rworldmap)
library(rvest)
library(tidyr)
library(janitor)
library(ggplot2)
library(rgeos)
library(data.table)
library(lubridate)
library(ggthemes)
library(plotly)
library(viridis)

# Reference tables shipped with rworldmap: country-level stats, region
# groupings, and country-name synonyms keyed by ISO3 code
data("countryExData")
data("countryRegions")
data("countrySynonyms")

# Load demonyms (e.g. "French" -> France): scrape the Wikipedia list of
# adjectival and demonymic forms for countries
webpage <- read_html("https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations")


# Read the raw clue data for all 35 seasons (tab-separated; column spec is
# shown in the rendered output above)
jeopardy_all <- read_tsv("master_season1-35.tsv")

# Keep only the Daily Double clues
daily_double_all <- jeopardy_all %>%
  filter(daily_double == "yes")

# Parse the demonym table from the Wikipedia page; the first table on the
# page holds country names with their adjectival and demonym forms.
# (header = FALSE spelled out -- never use bare F; `=` replaced with `<-`.)
table <- webpage %>%
  html_nodes("table") %>%
  html_table(header = FALSE)
table <- table[[1]]

# Promote the first row to column names, then drop that row
names(table) <- table[1, ]
table <- table %>%
  slice(-1) %>%
  clean_names()

# Converting country synonyms to a full list - one observation for each
# adjectival/demonym. Footnote markers like "[1]" are stripped first.
demonym_table <- table %>%
  as_tibble() %>%  # as.tibble() is deprecated (see the warning above)
  mutate(country_entity_name = str_replace(country_entity_name, "\\[.\\]", ""),
         adjectivals = str_replace(adjectivals, "\\[.\\]", ""),
         demonyms = str_replace(demonyms, "\\[.\\]", "")) %>%
  separate_rows(adjectivals, sep = ",\\s|/|\\sor\\s") %>%
  separate_rows(demonyms, sep = ",\\s|/|\\sor\\s")

# One row per (ISO3, synonym) pair; empty/missing names dropped
countrySynonyms_full <- countrySynonyms %>%
  pivot_longer(name1:name8, names_to = "name", values_to = "country") %>%
  filter(!is.na(country) & country != "") %>%
  drop_na()

# Master lookup: every synonym, adjectival, and demonym mapped to ISO3
country_names_full <- countrySynonyms_full %>%
  select(-c(name, ID)) %>%
  left_join(demonym_table, by = c("country" = "country_entity_name")) %>%
  pivot_longer(country:demonyms, names_to = "name_type", values_to = "names") %>%
  select(-name_type) %>%
  distinct() %>%
  clean_names() %>%
  drop_na() %>%
  filter(iso3 != "")

# Regex matching any known country name/demonym; "[^a-z]" requires a
# non-lowercase character after the match so e.g. "Chad" does not match
# inside a longer lowercase word. Built once instead of four times.
country_pattern <- paste0(paste(country_names_full$names, collapse = "|"),
                          "[^a-z]")

# Daily Doubles whose *answer* mentions a country
country_answers_all <- daily_double_all %>%
  filter(str_detect(string = answer, pattern = country_pattern)) %>%
  mutate(country_a = str_extract_all(string = answer,
                                     pattern = country_pattern)) %>%
  unnest(country_a)

# Daily Doubles whose *question* mentions a country.
# BUG FIX: the original filtered on `answer` here while extracting from
# `question`; filter on `question` so the two steps agree.
country_questions_all <- daily_double_all %>%
  filter(str_detect(string = question, pattern = country_pattern)) %>%
  mutate(country_q = str_extract_all(string = question,
                                     pattern = country_pattern)) %>%
  unnest(country_q)

# Joining iso codes, and tagging each mention with where it appeared.
# mutate() recycles a scalar across all rows, so the original
# rep("answer", nrow(.)) was redundant.
country_answers_iso_all <- country_answers_all %>%
  left_join(country_names_full, by = c("country_a" = "names")) %>%
  rename(country = country_a) %>%
  mutate(type = "answer")

country_questions_iso_all <- country_questions_all %>%
  left_join(country_names_full, by = c("country_q" = "names")) %>%
  rename(country = country_q) %>%
  mutate(type = "question")

# One last bit of cleaning to get the data required for plotting: combine
# answer-side and question-side mentions, then drop known false positives
# ("AMERICAN INDIANS" matching India; two uninhabited territories).
country_all_iso_allszn <- country_answers_iso_all %>%
  full_join(country_questions_iso_all) %>%
  filter(
    !(category == "AMERICAN INDIANS" & iso3 == "ind"),
    !(iso3 %in% c("iot", "atf"))
  ) %>%
  mutate(iso3 = toupper(iso3))


# Map creation: low-resolution world map from rworldmap, re-projected to
# the Robinson projection for display
wmap <- getMap(resolution = "low")
wmap <- spTransform(wmap, CRS("+proj=robin"))
# Get centroids -- one point per country polygon, labelled by ISO3 code
centroids <- gCentroid(wmap, byid = TRUE, id = wmap@data$ISO3)
centroids <- data.frame(centroids)
# Convert in place to a data.table, keeping the ISO3 rownames as column
# "rn"; the trailing [] forces the result to print (output shown below)
setDT(centroids, keep.rownames = TRUE)[]
##       rn        x          y
##   1: ABW -6565487  1339133.0
##   2: AFG  5901773  3616680.3
##   3: AGO  1644700 -1313636.8
##   4: AIA -5869637  1949085.5
##   5: ALB  1735387  4397165.1
##  ---                        
## 240: ZAF  2283373 -3098919.3
## 241: ZMB  2603114 -1438722.5
## 242: ZWE  2774634 -2032081.4
## 243: TUV 16815547  -829881.9
## 244: GUF -5026000   419853.4
# Give the rowname column a name that makes the later join key explicit
setnames(centroids, old = "rn", new = "country_iso3c")

# Upper-case every column so the synonym table matches the upper-cased
# ISO codes used downstream
countrySynonyms_full <- mutate_all(countrySynonyms_full, toupper)

# Per-country mention totals across all seasons, with display names and a
# plotly tooltip. The hover text is built inside the chain with mutate()
# instead of the original base-R `all_country_iso$hover = with(...)`,
# which also used `=` for assignment.
all_country_iso <- country_all_iso_allszn %>%
  mutate(date = ymd(air_date)) %>%
  mutate(year = year(date)) %>%
  select(c(round, value, daily_double, answer, question, country, iso3,
           type, date, year)) %>%
  group_by(iso3) %>%
  summarize(season_count = n()) %>%
  # NAME1 is the primary country name in rworldmap's synonym table
  left_join(countrySynonyms_full, by = c("iso3" = "ISO3")) %>%
  filter(name == "NAME1") %>%
  mutate(country = str_to_title(country)) %>%
  mutate(hover = paste(country, '<br>', "Total:", season_count))


# Join the new data set to the map: flatten the polygons to a plain data
# frame keyed by ISO3, then attach mention totals and centroid coordinates
wmap_df <- wmap %>%
  fortify(region = "ISO3") %>%
  left_join(all_country_iso, by = c("id" = "iso3")) %>%
  left_join(centroids, by = c("id" = "country_iso3c"))


# plotly
# Build the static choropleth first. `text = hover` is not a real ggplot2
# aesthetic -- that is what triggers the "Ignoring unknown aesthetics"
# warnings in the rendered output -- but ggplotly() picks it up and uses
# it for the tooltip, so the warning is expected and harmless here.
p <- ggplot(data = wmap_df) +
  geom_polygon(aes(x = long, y = lat, group = group, fill = season_count,
                   text = hover)) +
  labs(fill = "Number of mentions") +
  theme_map() +
  scale_fill_viridis_c()

# Convert to an interactive widget; tooltip = "text" wires the hover
# column in, and layout() adds a title with an HTML <sup> subtitle
plotly <- ggplotly(p, tooltip = "text") %>% 
  layout(title = list(text = paste0('Number of country mentions',
                                    '<br>',
                                    '<sup>',
                                    'Daily Doubles, seasons 1-35',
                                    '</sup>')))

# Render the widget
plotly

Lessons Learned